This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.

Try executing a chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Ctrl+Shift+Enter.

Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Ctrl+Alt+I.

When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Ctrl+Shift+K to preview the HTML file).

The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike Knit, Preview does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.

The code should be executed in order.

install.packages("tidyverse")  
Error in install.packages : Updating loaded packages
library(tidyverse)

read in data

clarks <- read.csv("data/Data\ Science\ Assessment.csv")

summarise data

summary(clarks)
       X             cust.id            age         credit.score   email       distance.to.store   online.visits      online.trans    
 Min.   :     1   Min.   :     1   Min.   :13.54   Min.   :504.4   no :70054   Min.   :   0.0306   Min.   :   0.00   Min.   :  0.000  
 1st Qu.: 25001   1st Qu.: 25001   1st Qu.:31.60   1st Qu.:689.9   yes:29946   1st Qu.:   3.2743   1st Qu.:   0.00   1st Qu.:  0.000  
 Median : 50000   Median : 50000   Median :34.97   Median :724.8               Median :   7.3183   Median :   7.00   Median :  2.000  
 Mean   : 50000   Mean   : 50000   Mean   :34.98   Mean   :724.9               Mean   :  14.9791   Mean   :  30.16   Mean   :  9.048  
 3rd Qu.: 75000   3rd Qu.: 75000   3rd Qu.:38.35   3rd Qu.:759.9               3rd Qu.:  16.4615   3rd Qu.:  34.00   3rd Qu.: 10.000  
 Max.   :100000   Max.   :100000   Max.   :56.33   Max.   :941.2               Max.   :1886.2149   Max.   :1046.00   Max.   :298.000  
                                                                                                                                      
  online.spend      store.trans      store.spend       sat.service    sat.selection  
 Min.   :   0.00   Min.   : 0.000   Min.   :   0.00   Min.   :1.0     Min.   :1.00   
 1st Qu.:   0.00   1st Qu.: 0.000   1st Qu.:   0.00   1st Qu.:3.0     1st Qu.:2.00   
 Median :  40.47   Median : 1.000   Median :  30.29   Median :3.0     Median :3.00   
 Mean   : 182.70   Mean   : 1.319   Mean   :  47.17   Mean   :3.1     Mean   :2.81   
 3rd Qu.: 202.27   3rd Qu.: 2.000   3rd Qu.:  65.83   3rd Qu.:4.0     3rd Qu.:3.00   
 Max.   :6781.67   Max.   :26.000   Max.   :1332.51   Max.   :5.0     Max.   :5.00   
                                                      NA's   :34933   NA's   :34933  

Note: Max age is very young - ignoring the fastest growing demographic in UK (the ageing baby boomers).

Very high extremes for many variables, heavily skewed data

check for missing data - only missing for the satisfaction score

missing.values <- clarks %>%
    gather(key = "key", value = "val") %>%
    mutate(is.missing = is.na(val)) %>%
    group_by(key, is.missing) %>%
    summarise(num.missing = n()) %>%
    filter(is.missing==T) %>%
    select(-is.missing) %>%
    arrange(desc(num.missing)) 
attributes are not identical across measure variables;
they will be dropped
missing.values
missing.values %>%
  ggplot() +
    geom_bar(aes(x=key, y=num.missing), stat = 'identity') +
    labs(x='variable', y="number of missing values", title='Number of missing values') +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Add some new columns

clarks <-  mutate(clarks,
                  total.spend = online.spend + store.spend,
                  total.trans = online.trans + store.trans) 
 
clarks <-  mutate(clarks,
                  prop.spend.online = online.spend /  total.spend,
                  prop.trans.online = online.trans / total.trans) 
clarks <-  mutate(clarks,
                  online.spend.per.trans = online.spend /  online.trans,
                  store.spend.per.trans = store.spend /  store.trans,
                  total.spend.per.trans = total.spend /  total.trans) 

In store spending accounts for 20% of total

total_online = sum(clarks$online.spend)
total_store = sum(clarks$store.spend)
total_store/(total_online+total_store)
[1] 0.2052013

profile customer spending in store or online

clarks$online.profile <- "Mixed"
clarks$online.profile[clarks$online.spend == 0] <- "Store only"
clarks$online.profile[clarks$store.spend == 0] <- "Online only"
clarks$online.profile[clarks$total.spend == 0] <- "No spend"

Characteristics of online profile including “No Spend” - no correlation with age, credit, satisfaction. Distance is interesting - it looks like the No Spenders and the Online only people live further away.

clarks %>% 
  group_by(online.profile) %>%
  summarise(no_rows = length(cust.id))
clarks %>% 
  group_by(online.profile) %>%
  summarise(mean_spend = mean(total.spend))
clarks %>% 
  group_by(online.profile) %>%
  summarise(mean_age = mean(age))
clarks %>% 
  group_by(online.profile) %>%
  summarise(mean_credit = mean(credit.score))
clarks %>% 
  group_by(online.profile) %>%
  summarise(median.distance = median(distance.to.store))
filter(clarks, sat.service != "NA") %>% 
  group_by(online.profile) %>%
  summarise(mean.sat.ser = mean(sat.service))
filter(clarks, sat.selection != "NA") %>% 
  group_by(online.profile) %>%
  summarise(mean.sat.sel = mean(sat.selection))

Data subset of only customers who actually spend something

clarks.spend <- filter(clarks , online.profile != "No spend")

Are there any duplicate customers who may be same person on and offline?

Check duplicates in distance.to.store? - only one and they have different age so all good.

length(unique(clarks$distance.to.store))
[1] 99999
n_occur <- data.frame(table(clarks$distance.to.store))
clarks[clarks$distance.to.store %in% n_occur$Var1[n_occur$Freq > 1],]

First look at correlations: Plot shows positive correlations in red and negative in blue. Bigger circles with stronger colour indicate stronger correlations.

data.numeric <- mutate(clarks, numeric.email = as.numeric(email=="yes"))
rquery.cormat(data.numeric[,c(3,4,6,7,8,9,10,11,12,13,15)])
$r

$p

$sym
                  age credit.score distance.to.store sat.service sat.selection online.spend online.visits online.trans total.trans store.trans
age               1                                                                                                                           
credit.score          1                                                                                                                       
distance.to.store                  1                                                                                                          
sat.service                                          1                                                                                        
sat.selection                                        .           1                                                                            
online.spend                                                                   1                                                              
online.visits                                                                  B            1                                                 
online.trans                                                                   B            B             1                                   
total.trans                                                                    B            B             1            1                      
store.trans                                                                                                                        1          
store.spend                                                                                                                        +          
                  store.spend
age                          
credit.score                 
distance.to.store            
sat.service                  
sat.selection                
online.spend                 
online.visits                
online.trans                 
total.trans                  
store.trans                  
store.spend       1          
attr(,"legend")
[1] 0 ‘ ’ 0.3 ‘.’ 0.6 ‘,’ 0.8 ‘+’ 0.9 ‘*’ 0.95 ‘B’ 1

online spend, visits and transactions are strongly positively correlated; store transactions and spend are also strongly positively correlated

satisfaction with service and selection are positively correlated; age and credit score are loosely correlated (positive)

weak negative correlation between distance to store and store spend and transactions

very weak negative correlation between age and online visit/trans/spend (older people slightly less likely to shop online)

total transactions are far more linked to online spend - probably as store spend is only 20% of total

Are number of online and store transactions positively or negatively correlated? -negative in the extremes but no major correlation

rr

 ggplot(data = clarks, aes(x = store.trans, y = online.trans)) +
          geom_point() 

rr

NA

Plot online and store spends - reveals strange gap in online spending around £28

ggplot(data = clarks.spend, aes(x = store.spend, y = online.spend, colour = online.profile)) +
          geom_point() + guides(colour = guide_legend(override.aes = list(alpha = 1))) 

ggplot(data = clarks.spend, aes(x = store.spend, y = online.spend, colour = online.profile)) +
          geom_point() + guides(colour = guide_legend(override.aes = list(alpha = 1))) +
           scale_x_continuous(trans='log10') + scale_y_continuous(trans='log10')

Are transactions and spend correlated? YES

rr

 ggplot(data = clarks, aes(x = total.trans, y = total.spend)) +
          geom_point()

Are prop online and total trans/spend correlated? - Yes, particularly at the upper end

rr

 
filter(clarks, !is.nan(prop.spend.online)) %>%
ggplot( aes(x = prop.spend.online, y = total.spend)) +
          geom_point()

rr

 
filter(clarks, !is.nan(prop.trans.online)) %>%
ggplot( aes(x = prop.trans.online, y = total.trans)) +
          geom_point()

Online profile

Online only people seem to spend a lot less PER TRANSACTION and within a much smaller range (e.g. for shoes only buy one pair at a time) - incentivise buy one get one 20% off as they add to cart?

clarks.spend %>%
  ggplot( aes(x = online.profile, y = total.spend.per.trans, group = online.profile)) + 
    geom_jitter( alpha=0.05 , colour = "red" )   +
    geom_boxplot( colour = "blue" , alpha=0)  

pivot_longer(clarks.spend, cols = store.spend.per.trans:online.spend.per.trans, names_to = "where", values_to = "ave.spend.per.trans") %>%
ggplot(aes(x = where,y = ave.spend.per.trans), group = where) + 
    geom_jitter( alpha=0.05 , colour = "red" )   +
    geom_boxplot( colour = "blue" , alpha=0)  

Store only people tend not to spend as much OVERALL - can we get them to be returning customers? Offer in bag with purchase? For returning to store or online? Collect email addresses in store? Do we already ask? How high is take up rate?

clarks.spend %>%
  ggplot( aes(x = online.profile, y = total.spend, group = online.profile)) +
    guides(colour = guide_legend(override.aes = list(alpha = 1)))+
   geom_jitter( alpha=0.05 , colour = "red" )   +
    geom_boxplot( colour = "blue" , alpha=0)  

select(clarks.spend, store.spend,online.spend)%>%
pivot_longer( cols = 1:2, names_to = "where", values_to = "spend") %>%
ggplot(aes(x = where,y = spend), group = where) + 
    geom_jitter( alpha=0.05 , colour = "red" )   +
    geom_boxplot( colour = "blue" , alpha=0)  

Email is slightly lower for store only, but actually quite low throughout - perhaps we can encourage email sharing and collect data to see how successful this is. Competition or offer to get people to sign up?

Look at satisfaction

This is not very correlated to anything other than the two metrics being correlated with each other

clarks.spend %>%
  ggplot( aes(x = online.profile, y = sat.selection, group = online.profile)) +
    guides(colour = guide_legend(override.aes = list(alpha = 1)))+
    geom_jitter(alpha = 0.01 , colour = "blue" ) +
    geom_boxplot(alpha = 0)  

How do they have satisfaction scores for no spend people?

clarks[clarks$sat.service != "NA",] %>%
ggplot(aes(online.profile)) + 
  geom_bar(aes(fill = as.factor(sat.service)), position = "fill")

Age

rr ggplot(data = clarks.spend, aes(x = age, y = total.spend, colour = online.profile)) + geom_point(alpha = 0.1) + guides(colour = guide_legend(override.aes = list(alpha = 1))) + scale_y_continuous(trans='log2')

rr

 ggplot(data = clarks.spend, aes(x = age, y = total.spend.per.trans, colour = online.profile)) +
          geom_point(alpha = 0.1) + guides(colour = guide_legend(override.aes = list(alpha = 1))) 

It seems there is little relationship between age and TOTAL SPEND, but old and young spend less PER TRANSACTION. Perhaps those in 30s buy for their whole family at once but can’t budget more overall. Or they have less time to shop so buy all at once.

rr

filter(clarks.spend , online.profile != "Store only") %>%
 ggplot(aes(x = age, y = total.spend.per.trans, colour = online.profile)) +
          geom_point(alpha = 0.1) + guides(colour = guide_legend(override.aes = list(alpha = 1))) 

Credit score

rr

 ggplot(data = clarks.spend, aes(x = credit.score, y = total.spend, colour = online.profile)) +
          geom_point(alpha = 0.1)+ guides(colour = guide_legend(override.aes = list(alpha = 1))) +
 scale_y_continuous(trans='log2')

rr

 ggplot(data = clarks.spend, aes(x = credit.score, y = total.spend.per.trans, colour = online.profile)) +
          geom_point(alpha = 0.1)+ guides(colour = guide_legend(override.aes = list(alpha = 1))) +
 scale_y_continuous(trans='log2')

distance to store

rr

filter(clarks.spend, distance.to.store < 0.99* max(distance.to.store)) %>%
ggplot( aes(x = distance.to.store, y = total.spend, colour = online.profile)) +
          geom_point(alpha = 0.1) + guides(colour = guide_legend(override.aes = list(alpha = 1))) +
         scale_x_continuous(trans='log10') + scale_y_continuous(trans='log2')

Can see what you would expect here - those closer to store more likely to spend in store. those far away more likely to shop online only.

Below are lots more plots which may be of interest in future with different data but did not yield much insight

rr

 ggplot(data = clarks, aes(x = online.visits, y = total.spend,  colour = online.profile)) +
          geom_point(alpha = 0.1) + guides(colour = guide_legend(override.aes = list(alpha = 1)))

rr

 ggplot(data = clarks, aes(x = online.visits, y = online.spend, colour = online.profile)) +
          geom_point(alpha = 0.1) + guides(colour = guide_legend(override.aes = list(alpha = 1)))

rr

 ggplot(data = clarks, aes(x = online.visits, y = store.spend, colour = online.profile)) +
          geom_point(alpha = 0.1) + guides(colour = guide_legend(override.aes = list(alpha = 1)))

 ggplot(data = clarks, aes(x = store.trans, y = store.spend, colour = prop.spend.online)) +
          geom_point(alpha = 0.1)

 ggplot(data = clarks, aes(x = store.trans, y = online.spend, colour = prop.spend.online)) +
          geom_point(alpha = 0.1)

 ggplot(data = clarks, aes(x = store.trans, y = total.spend, colour = prop.spend.online)) +
          geom_point(alpha = 0.1)

Are older people more satisfied?

clarks %>%
mutate(age.group=cut(age, breaks=c(0, 20, 30, 40, 50, 100), labels=c("<20","20s","30s","40s","50s"))) %>%
  group_by(age.group) %>%
  filter(sat.service != "NA") %>%
    summarize(mean.satisfaction = mean(sat.service))

Are older people spending less per transaction? I did look this way, but actually just fewer data points in these categories.

rr

clarks %>%
mutate(age.group=cut(age, breaks=c(0, 20, 30, 40, 50, 100), labels=c("<20","20s","30s","40s","50s"))) %>%
  ggplot( aes(x = age.group, y = total.spend.per.trans, group = age.group, colour = online.profile)) +
    guides(colour = guide_legend(override.aes = list(alpha = 1)))+
    geom_jitter(alpha = 0.1 ) +
    geom_boxplot(alpha = 0)  

Are older people spending more money?

clarks %>%
mutate(age.group=cut(age, breaks=c(0, 20, 30, 40, 50, 100), labels=c("<20","20s","30s","40s","50s"))) %>%
  ggplot( aes(x = age.group, y = total.spend, group = age.group, colour = online.profile)) +
    guides(colour = guide_legend(override.aes = list(alpha = 1)))+
    geom_jitter(alpha = 0.1 ) +
    geom_boxplot(alpha = 0)  

Are older people spending more frequently?

clarks.spend %>%
mutate(age.group=cut(age, breaks=c(0, 20, 30, 40, 50, 100), labels=c("<20","20s","30s","40s","50s"))) %>%
  ggplot( aes(x = age.group, y = total.trans, group = age.group, colour = online.profile)) +
    guides(colour = guide_legend(override.aes = list(alpha = 1)))+
    geom_jitter(alpha = 0.1 ) +
    geom_boxplot(alpha = 0)  

# Satisfaction as a input?

clarks %>% 
  group_by(sat.service) %>%
  summarise(no_rows = length(cust.id))
clarks %>% 
  group_by(sat.service) %>%
  summarise(mean_spend = mean(total.spend))
clarks %>% 
  group_by(sat.service) %>%
  summarise(mean_age = mean(age))
clarks %>% 
  group_by(sat.service) %>%
  summarise(mean_credit = mean(credit.score))
clarks %>% 
  group_by(sat.service) %>%
  summarise(median.distance = median(distance.to.store))
filter(clarks, sat.selection != "NA") %>% 
  group_by(sat.service) %>%
  summarise(mean.sat.sel = mean(sat.selection))

There doesn’t appear to be any correlation between satisfaction or service and any other variables, except for satisfaction of selection which you would expect high scorers would score high on both

Are more satisfied people making more purchases?

filter(clarks, sat.service != "NA") %>%
ggplot( aes(x = sat.service, y = total.trans, group = sat.service, colour = online.profile)) +
    guides(colour = guide_legend(override.aes = list(alpha = 1)))+
    geom_jitter(alpha = 0.1 ) +
    geom_boxplot(alpha = 0)  

Are more satisfied people spending more?

rr


filter(clarks, sat.service != "NA") %>%
ggplot( aes(x = sat.service, y = total.spend, group = sat.service, color = prop.spend.online)) +
    geom_jitter(alpha = 0.1 ) +
    geom_boxplot(alpha = 0)  

rr

filter(clarks, sat.selection != "NA") %>%
ggplot( aes(x = sat.selection, y = total.spend, group = sat.selection, color = prop.spend.online)) +
    geom_jitter(alpha = 0.1 ) +
    geom_boxplot(alpha = 0)  

rr

filter(clarks, sat.service != "NA") %>%
filter( !is.nan(prop.spend.online)) %>%
ggplot( aes(x = prop.spend.online, y = sat.service)) +
    geom_point(alpha = 0.1 ) 

rr


filter(clarks, sat.service != "NA") %>%
ggplot( aes(x = sat.service, y =  prop.spend.online, group = sat.service)) +
    geom_jitter(alpha = 0.05 ) +
    geom_violin(alpha = 0, colour = "red")  

rr


filter(clarks, sat.selection != "NA") %>%
ggplot( aes(x = sat.selection, y =  prop.spend.online, group = sat.selection)) +
    geom_jitter(alpha = 0.05 ) +
    geom_violin(alpha = 0, colour = "red")  

rr clarks%>% group_by(sat.selection)%>% summarize(mean.spend = mean(total.spend))%>% ggplot( aes(x=sat.selection, y=mean.spend)) + geom_col()

rr

clarks%>%
group_by(sat.service)%>%
summarize(mean.spend = mean(total.spend))%>%
  ggplot( aes(x=sat.service, y=mean.spend)) + 
  geom_col()

rr


filter(clarks, sat.selection != "NA") %>%
ggplot( aes(x = sat.selection)) +
    geom_bar( ) 
  

rr

filter(clarks, sat.selection != "NA") %>%
ggplot( aes(x = sat.service)) +
    geom_bar( ) 
  

rr

clarks%>%
mutate(no.online.visits = online.visits==0) %>% 
ggplot( aes(x = no.online.visits, y = total.spend, group = no.online.visits, colour = sat.service)) +
    geom_jitter(alpha = 0.05 ) +
    geom_violin(alpha = 0, colour = "red")  

rr

clarks.spend%>%
filter(online.profile != "None") %>%
ggplot( aes(x = online.profile, y = total.spend, group = online.profile, colour = sat.service)) +
    geom_jitter(alpha = 0.05 ) +
    geom_violin(alpha = 0, colour = "red")  

rr

clarks.spend%>%
filter(online.profile != "None") %>%
ggplot( aes(x = online.profile, y = distance.to.store, group = online.profile, colour = sat.service)) +
    geom_jitter(alpha = 0.05 ) +
    geom_violin(alpha = 0, colour = "red")  

rr

clarks%>%
filter(store.spend>0)%>%
ggplot( aes(x = distance.to.store, y = store.spend, colour = sat.service)) +
geom_point(alpha=0.1)

rr

clarks%>%
mutate(store = store.spend > 0)%>%
group_by(store) %>%
summarize(mean_spend = mean(online.spend))

rr

clarks%>%
mutate(store = store.spend > 0)%>%
ggplot(aes(x=store, y=total.spend , group=store))+
  geom_violin()
---
title: "R Notebook - Clarks interview assignment"
output: html_notebook
---

This is an [R Markdown](http://rmarkdown.rstudio.com) Notebook. When you execute code within the notebook, the results appear beneath the code. 

Try executing a chunk by clicking the *Run* button within the chunk or by placing your cursor inside it and pressing *Ctrl+Shift+Enter*. 

Add a new chunk by clicking the *Insert Chunk* button on the toolbar or by pressing *Ctrl+Alt+I*.

When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the *Preview* button or press *Ctrl+Shift+K* to preview the HTML file).

The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike *Knit*, *Preview* does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.

The code should be executed in order.

```{r}
# Install only the packages that are not already available. Calling
# install.packages() unconditionally re-installs on every run and fails with
# "Updating loaded packages" when a package is already loaded in the session.
required_packages <- c("tidyverse", "corrplot")
missing_packages <- setdiff(required_packages, rownames(installed.packages()))
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}
```


```{r}
library("tidyverse")
library("corrplot")
```

read in data
```{r}
# Load the assessment data set from the project's data/ directory.
clarks <- read.csv(file.path("data", "Data Science Assessment.csv"))
```

summarise data
```{r}
# Per-column summary statistics (quartiles, mean, NA counts) of the raw data.
clarks %>%
  summary()
```

# Note: Max age is very young - ignoring the fastest growing demographic in UK (the ageing baby boomers).
# Very high extremes for many variables, heavily skewed data



check for missing data - only missing for the satisfaction score
```{r}
# Count the missing values in every column and keep only the columns that
# actually contain NAs, most affected first (ties broken by column name).
# Counting column-wise avoids stacking columns of different types into a
# single value column, which is what made gather() warn that "attributes are
# not identical across measure variables".
na.counts <- colSums(is.na(clarks))

missing.values <- tibble(
  key = names(na.counts),
  num.missing = as.integer(na.counts)
) %>%
  filter(num.missing > 0) %>%
  arrange(desc(num.missing), key)

missing.values
```
```{r}
# Bar chart of the number of missing values per variable; the x-axis labels
# are rotated 45 degrees so the variable names do not overlap.
ggplot(missing.values, aes(x = key, y = num.missing)) +
  geom_col() +
  labs(
    x = "variable",
    y = "number of missing values",
    title = "Number of missing values"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
```



# Add some new columns
```{r}
# Derive combined totals, online shares and average spend per transaction.
# mutate() evaluates its arguments in order, so later columns can use the
# totals defined earlier in the same call. Ratios are NaN where both the
# numerator and the denominator are zero.
clarks <- clarks %>%
  mutate(
    total.spend = online.spend + store.spend,
    total.trans = online.trans + store.trans,
    prop.spend.online = online.spend / total.spend,
    prop.trans.online = online.trans / total.trans,
    online.spend.per.trans = online.spend / online.trans,
    store.spend.per.trans = store.spend / store.trans,
    total.spend.per.trans = total.spend / total.trans
  )
```



# In store spending accounts for 20% of total
```{r}
# Share of all spend that happens in store.
total_online <- sum(clarks$online.spend)
total_store <- sum(clarks$store.spend)

total_store / (total_online + total_store)
```


# profile customer spending in store or online
```{r}
# Label each customer by where they spend. case_when() takes the first
# matching rule, so the rules are listed from most to least specific:
# no spend at all, then nothing in store, then nothing online, else both.
clarks <- clarks %>%
  mutate(
    online.profile = case_when(
      total.spend == 0 ~ "No spend",
      store.spend == 0 ~ "Online only",
      online.spend == 0 ~ "Store only",
      TRUE ~ "Mixed"
    )
  )
```



#Characteristics of online profile including "No Spend"  - no correlation with age, credit, satisfaction. Distance is interesting - it looks like the No Spenders and the Online only people live further away.
```{r}
# Per-profile summaries: customer count, mean spend, mean age, mean credit
# score and median distance to store. Each pipeline prints its own table.
clarks %>%
  group_by(online.profile) %>%
  summarise(no_rows = n())

clarks %>%
  group_by(online.profile) %>%
  summarise(mean_spend = mean(total.spend))

clarks %>%
  group_by(online.profile) %>%
  summarise(mean_age = mean(age))

clarks %>%
  group_by(online.profile) %>%
  summarise(mean_credit = mean(credit.score))

clarks %>%
  group_by(online.profile) %>%
  summarise(median.distance = median(distance.to.store))

# Satisfaction scores are missing for many customers. Drop those rows with
# is.na() rather than comparing the numeric column to the string "NA".
clarks %>%
  filter(!is.na(sat.service)) %>%
  group_by(online.profile) %>%
  summarise(mean.sat.ser = mean(sat.service))

clarks %>%
  filter(!is.na(sat.selection)) %>%
  group_by(online.profile) %>%
  summarise(mean.sat.sel = mean(sat.selection))


```


# Data subset of only customers who actually spend something
```{r}
# Keep only customers with some spend (online, in store, or both).
clarks.spend <- clarks %>%
  filter(online.profile != "No spend")
```



#Are there any duplicate customers who may be same person on and offline?
#Check duplicates in distance.to.store? - only one and they have different age so all good.
```{r}
# Number of distinct distance values; fewer than nrow(clarks) means some
# customers share exactly the same distance to store.
length(unique(clarks$distance.to.store))

# Show every row whose distance occurs more than once. duplicated() flags the
# second and later occurrences, and fromLast = TRUE flags the earlier ones, so
# the union marks all members of each duplicate set. This compares the numbers
# directly instead of matching them against the level strings of a table().
is.dup.distance <- duplicated(clarks$distance.to.store) |
  duplicated(clarks$distance.to.store, fromLast = TRUE)
clarks[is.dup.distance, ]


```


First look at correlations:
Plot shows positive correlations in red and negative in blue. Bigger circles with stronger colour indicate stronger correlations.
```{r}
# rquery.cormat() is not provided by tidyverse or corrplot; it is a helper
# script published by STHDA, so load it if the session does not have it yet.
# NOTE(review): this sources remote code over HTTP - consider vendoring a
# reviewed copy of the script into the project instead.
if (!exists("rquery.cormat")) {
  source("http://www.sthda.com/upload/rquery_cormat.r")
}

# email recoded as 0/1. NOTE(review): numeric.email is not among the columns
# passed to the correlation matrix below - confirm whether it should be.
data.numeric <- mutate(clarks, numeric.email = as.numeric(email == "yes"))

# Select the numeric columns by name rather than by position, so the
# correlation matrix does not silently change if the column order does.
cor.columns <- c(
  "age", "credit.score", "distance.to.store",
  "online.visits", "online.trans", "online.spend",
  "store.trans", "store.spend",
  "sat.service", "sat.selection",
  "total.trans"
)
rquery.cormat(data.numeric[, cor.columns])
```

online spend, visits and transactions are strongly positively correlated
store transactions and spend are also strongly positively correlated

satisfaction with service and selection positively correlated
age and credit score loosely correlated (positive)

weak negative correlation between distance to store and store spend and transactions

very weak negative correlation between age and online visit/trans/spend (older people slightly less likely to shop online)

total transactions are far more linked to online spend - probably as store spend is only 20% of total




# Are number of online and store transactions positively or negatively correlated? -negative in the extremes but no major correlation
```{r}
 # Scatter of online against in-store transaction counts, one point per customer.
 clarks %>%
   ggplot(aes(x = store.trans, y = online.trans)) +
   geom_point()
        
```



Plot online and store spends - reveals strange gap in online spending around £28

```{r}
# Online spend against in-store spend for spending customers, coloured by
# profile; the legend keys are forced to full opacity.
clarks.spend %>%
  ggplot(aes(x = store.spend, y = online.spend, colour = online.profile)) +
  geom_point() +
  guides(colour = guide_legend(override.aes = list(alpha = 1)))
```

```{r}
# Same scatter as above, with both axes on a log10 scale to spread out the
# heavily skewed spend values.
clarks.spend %>%
  ggplot(aes(x = store.spend, y = online.spend, colour = online.profile)) +
  geom_point() +
  guides(colour = guide_legend(override.aes = list(alpha = 1))) +
  scale_x_log10() +
  scale_y_log10()
```


Are transactions and spend correlated?  YES
```{r}
 # Total spend against total number of transactions, one point per customer.
 clarks %>%
   ggplot(aes(x = total.trans, y = total.spend)) +
   geom_point()
```


Are prop online and total trans/spend correlated?  - Yes, particularly at the upper end
```{r}
 
# Total spend against the online share of spend. Customers whose share is NaN
# (zero total spend) are dropped first.
clarks %>%
  filter(!is.nan(prop.spend.online)) %>%
  ggplot(aes(x = prop.spend.online, y = total.spend)) +
  geom_point()
```
```{r}
 
# Total transactions against the online share of transactions. Customers
# whose share is NaN (zero transactions) are dropped first.
clarks %>%
  filter(!is.nan(prop.trans.online)) %>%
  ggplot(aes(x = prop.trans.online, y = total.trans)) +
  geom_point()
```




# Online profile
Online only people seem to spend a lot less PER TRANSACTION and within a much smaller range (e.g. for shoes only buy one pair at a time) - incentivise buy one get one 20% off as they add to cart?

```{r}
# Spend per transaction by profile: jittered raw points in red with an
# unfilled blue box plot drawn on top.
ggplot(
  clarks.spend,
  aes(x = online.profile, y = total.spend.per.trans, group = online.profile)
) +
  geom_jitter(alpha = 0.05, colour = "red") +
  geom_boxplot(colour = "blue", alpha = 0)
```


```{r}
# Compare average spend per transaction between the online and store channels.
# The two per-channel columns are stacked into one long column ("where" holds
# the channel column name) so each channel becomes one box. The columns are
# named explicitly rather than selected as a range, which depended on their
# position in the data frame. `group` now sits inside aes(); as an argument
# to ggplot() it was silently ignored.
clarks.spend %>%
  pivot_longer(
    cols = c(online.spend.per.trans, store.spend.per.trans),
    names_to = "where",
    values_to = "ave.spend.per.trans"
  ) %>%
  ggplot(aes(x = where, y = ave.spend.per.trans, group = where)) +
  geom_jitter(alpha = 0.05, colour = "red") +
  geom_boxplot(colour = "blue", alpha = 0)
```



Store only people tend not to spend as much OVERALL - can we get them to be returning customers? Offer in bag with purchase? For returning to store or online? Collect email addresses in store? Do we already ask? How high is take up rate?

```{r}
# Total spend by profile: jittered raw points in red with an unfilled blue
# box plot drawn on top.
ggplot(
  clarks.spend,
  aes(x = online.profile, y = total.spend, group = online.profile)
) +
  guides(colour = guide_legend(override.aes = list(alpha = 1))) +
  geom_jitter(alpha = 0.05, colour = "red") +
  geom_boxplot(colour = "blue", alpha = 0)
```

```{r}

# Compare spend between the store and online channels. The two spend columns
# are stacked into one long column ("where" holds the channel column name) so
# each channel becomes one box. `group` now sits inside aes(); as an argument
# to ggplot() it was silently ignored.
clarks.spend %>%
  select(store.spend, online.spend) %>%
  pivot_longer(
    cols = c(store.spend, online.spend),
    names_to = "where",
    values_to = "spend"
  ) %>%
  ggplot(aes(x = where, y = spend, group = where)) +
  geom_jitter(alpha = 0.05, colour = "red") +
  geom_boxplot(colour = "blue", alpha = 0)
```



Email is slightly lower for store only, but actually quite low throughout - perhaps we can encourage email sharing and collect data to see how successful this is. Competition or offer to get people to sign up?

```{r}
# Proportion of customers with and without an email on file, per profile
# (bars are stacked to 100%).
ggplot(clarks.spend, aes(x = online.profile)) +
  geom_bar(
    aes(fill = as.factor(email)),
    position = "fill"
  )
```





# Look at satisfaction 
This is not very correlated to anything other than the two metrics being correlated with each other

```{r}
# Satisfaction with selection by profile: very faint jittered points in blue
# with an unfilled box plot drawn on top.
ggplot(
  clarks.spend,
  aes(x = online.profile, y = sat.selection, group = online.profile)
) +
  guides(colour = guide_legend(override.aes = list(alpha = 1))) +
  geom_jitter(alpha = 0.01, colour = "blue") +
  geom_boxplot(alpha = 0)
```

How do they have satisfaction scores for no spend people?
```{r}
# Distribution of service-satisfaction scores within each profile (bars are
# stacked to 100%). Rows without a score are removed with is.na(): indexing
# with `clarks$sat.service != "NA"` yields NA for those rows, and base `[`
# turns NA indices into all-NA rows, which showed up as a spurious NA bar.
clarks %>%
  filter(!is.na(sat.service)) %>%
  ggplot(aes(online.profile)) +
  geom_bar(aes(fill = as.factor(sat.service)), position = "fill")
```

# Age
```{r}
 # Total spend (log2 scale) against age, coloured by profile; points are
 # translucent but the legend keys are forced to full opacity.
 clarks.spend %>%
   ggplot(aes(x = age, y = total.spend, colour = online.profile)) +
   geom_point(alpha = 0.1) +
   guides(colour = guide_legend(override.aes = list(alpha = 1))) +
   scale_y_continuous(trans = "log2")
```


```{r}
 # Spend per transaction against age, coloured by profile.
 clarks.spend %>%
   ggplot(aes(x = age, y = total.spend.per.trans, colour = online.profile)) +
   geom_point(alpha = 0.1) +
   guides(colour = guide_legend(override.aes = list(alpha = 1)))
```
It seems there is little relationship between age and TOTAL SPEND, but old and young spend less PER TRANSACTION. Perhaps those in 30s buy for their whole family at once but can't budget more overall. Or they have less time to shop so buy all at once.




```{r}

# Spend per transaction against age, with store-only customers excluded.
clarks.spend %>%
  filter(online.profile != "Store only") %>%
  ggplot(aes(x = age, y = total.spend.per.trans, colour = online.profile)) +
  geom_point(alpha = 0.1) +
  guides(colour = guide_legend(override.aes = list(alpha = 1)))
```






# Credit score

```{r}
 # Total spend (log2 scale) against credit score, coloured by profile.
 clarks.spend %>%
   ggplot(aes(x = credit.score, y = total.spend, colour = online.profile)) +
   geom_point(alpha = 0.1) +
   guides(colour = guide_legend(override.aes = list(alpha = 1))) +
   scale_y_continuous(trans = "log2")
```

```{r}
 # Spend per transaction (log2 scale) against credit score, coloured by profile.
 clarks.spend %>%
   ggplot(
     aes(x = credit.score, y = total.spend.per.trans, colour = online.profile)
   ) +
   geom_point(alpha = 0.1) +
   guides(colour = guide_legend(override.aes = list(alpha = 1))) +
   scale_y_continuous(trans = "log2")
```

# distance to store

```{r}
# Total spend (log2) against distance to store (log10), coloured by profile.
# Only customers closer than 99% of the maximum distance are kept.
clarks.spend %>%
  filter(distance.to.store < 0.99 * max(distance.to.store)) %>%
  ggplot(aes(x = distance.to.store, y = total.spend, colour = online.profile)) +
  geom_point(alpha = 0.1) +
  guides(colour = guide_legend(override.aes = list(alpha = 1))) +
  scale_x_log10() +
  scale_y_continuous(trans = "log2")
```

Can see what you would expect here - those closer to store more likely to spend in store. those far away more likely to shop online only.




# Below are lots more plots which may be of interest in future with different data but did not yield much insight

```{r}
 # Total spend against number of online visits, coloured by profile.
 clarks %>%
   ggplot(aes(x = online.visits, y = total.spend, colour = online.profile)) +
   geom_point(alpha = 0.1) +
   guides(colour = guide_legend(override.aes = list(alpha = 1)))
```


```{r}
 # Online spend against number of online visits, coloured by profile.
 clarks %>%
   ggplot(aes(x = online.visits, y = online.spend, colour = online.profile)) +
   geom_point(alpha = 0.1) +
   guides(colour = guide_legend(override.aes = list(alpha = 1)))
```
```{r}
 # Online visits against in-store spend, coloured by online profile.
 # Legend keys are drawn fully opaque even though the points are faint.
 clarks %>%
   ggplot(
     aes(x = online.visits, y = store.spend, colour = online.profile)
   ) +
   guides(colour = guide_legend(override.aes = list(alpha = 1))) +
   geom_point(alpha = 0.1)
```






```{r}
 # Store transactions against store spend; point colour follows
 # prop.spend.online.
 clarks %>%
   ggplot(
     aes(x = store.trans, y = store.spend, colour = prop.spend.online)
   ) +
   geom_point(alpha = 0.1)
```

```{r}
 # Store transactions against online spend; point colour follows
 # prop.spend.online.
 clarks %>%
   ggplot(
     aes(x = store.trans, y = online.spend, colour = prop.spend.online)
   ) +
   geom_point(alpha = 0.1)
```


```{r}
 # Store transactions against total spend; point colour follows
 # prop.spend.online.
 clarks %>%
   ggplot(
     aes(x = store.trans, y = total.spend, colour = prop.spend.online)
   ) +
   geom_point(alpha = 0.1)
```





 Are older people more satisfied? 
 
```{r}
# Mean service-satisfaction score per age band.
# Rows with a missing sat.service are dropped with is.na() rather than by
# comparing against the string "NA".
# Bands are left-closed (right = FALSE) so that an age of exactly 30 lands
# in "30s" and exactly 20 in "20s", matching the labels. The last band,
# labelled "50s", covers everything from 50 up to (not including) 100.
clarks %>%
  filter(!is.na(sat.service)) %>%
  mutate(
    age.group = cut(
      age,
      breaks = c(0, 20, 30, 40, 50, 100),
      labels = c("<20", "20s", "30s", "40s", "50s"),
      right = FALSE
    )
  ) %>%
  group_by(age.group) %>%
  summarize(mean.satisfaction = mean(sat.service))

```
 

 
  Are older people spending less per transaction? It did look this way, but actually there are just fewer data points in these categories.
 
```{r}
# Spend per transaction by age band: jittered points with an unfilled
# box plot drawn on top.
# Bands are left-closed (right = FALSE) so that an age of exactly 30 lands
# in "30s" and exactly 20 in "20s", matching the labels. The last band,
# labelled "50s", covers everything from 50 up to (not including) 100.
clarks %>%
  mutate(
    age.group = cut(
      age,
      breaks = c(0, 20, 30, 40, 50, 100),
      labels = c("<20", "20s", "30s", "40s", "50s"),
      right = FALSE
    )
  ) %>%
  ggplot(
    aes(
      x = age.group,
      y = total.spend.per.trans,
      group = age.group,
      colour = online.profile
    )
  ) +
  # Legend keys are drawn fully opaque even though the points are faint.
  guides(colour = guide_legend(override.aes = list(alpha = 1))) +
  geom_jitter(alpha = 0.1) +
  geom_boxplot(alpha = 0)
```
 
  Are older people spending more money?
 
```{r}
# Total spend by age band: jittered points with an unfilled box plot
# drawn on top.
# Bands are left-closed (right = FALSE) so that an age of exactly 30 lands
# in "30s" and exactly 20 in "20s", matching the labels. The last band,
# labelled "50s", covers everything from 50 up to (not including) 100.
clarks %>%
  mutate(
    age.group = cut(
      age,
      breaks = c(0, 20, 30, 40, 50, 100),
      labels = c("<20", "20s", "30s", "40s", "50s"),
      right = FALSE
    )
  ) %>%
  ggplot(
    aes(
      x = age.group,
      y = total.spend,
      group = age.group,
      colour = online.profile
    )
  ) +
  # Legend keys are drawn fully opaque even though the points are faint.
  guides(colour = guide_legend(override.aes = list(alpha = 1))) +
  geom_jitter(alpha = 0.1) +
  geom_boxplot(alpha = 0)
```
 
  Are older people spending more frequently?
 
```{r}
# Number of transactions by age band: jittered points with an unfilled
# box plot drawn on top.
# Bands are left-closed (right = FALSE) so that an age of exactly 30 lands
# in "30s" and exactly 20 in "20s", matching the labels. The last band,
# labelled "50s", covers everything from 50 up to (not including) 100.
clarks.spend %>%
  mutate(
    age.group = cut(
      age,
      breaks = c(0, 20, 30, 40, 50, 100),
      labels = c("<20", "20s", "30s", "40s", "50s"),
      right = FALSE
    )
  ) %>%
  ggplot(
    aes(
      x = age.group,
      y = total.trans,
      group = age.group,
      colour = online.profile
    )
  ) +
  # Legend keys are drawn fully opaque even though the points are faint.
  guides(colour = guide_legend(override.aes = list(alpha = 1))) +
  geom_jitter(alpha = 0.1) +
  geom_boxplot(alpha = 0)
```


 # Satisfaction as an input?
 
```{r}
# Per-sat.service summaries, each printed as its own table.
# The first five keep the group of rows where sat.service is missing.

# Number of rows for each sat.service value.
clarks %>%
  group_by(sat.service) %>%
  summarise(no_rows = n())

# Mean total spend for each sat.service value.
clarks %>%
  group_by(sat.service) %>%
  summarise(mean_spend = mean(total.spend))

# Mean age for each sat.service value.
clarks %>%
  group_by(sat.service) %>%
  summarise(mean_age = mean(age))

# Mean credit score for each sat.service value.
clarks %>%
  group_by(sat.service) %>%
  summarise(mean_credit = mean(credit.score))

# Median distance to store for each sat.service value.
clarks %>%
  group_by(sat.service) %>%
  summarise(median.distance = median(distance.to.store))

# Mean sat.selection for each sat.service value. Rows with a missing
# sat.selection are dropped first (via is.na(), not a comparison against
# the string "NA") so the mean is not NA.
clarks %>%
  filter(!is.na(sat.selection)) %>%
  group_by(sat.service) %>%
  summarise(mean.sat.sel = mean(sat.selection))
```

There doesn't appear to be any correlation between satisfaction with service and any other variable, except for satisfaction with selection, which you would expect, since high scorers would score high on both.
 
 
 Are more satisfied people making more purchases?
```{r}

# Number of transactions per sat.service score: jittered points with an
# unfilled box plot drawn on top.
# Rows with a missing sat.service are dropped with is.na() rather than by
# comparing against the string "NA".
clarks %>%
  filter(!is.na(sat.service)) %>%
  ggplot(
    aes(
      x = sat.service,
      y = total.trans,
      group = sat.service,
      colour = online.profile
    )
  ) +
  # Legend keys are drawn fully opaque even though the points are faint.
  guides(colour = guide_legend(override.aes = list(alpha = 1))) +
  geom_jitter(alpha = 0.1) +
  geom_boxplot(alpha = 0)
```
 
 
 
   Are more satisfied people spending more?
```{r}

# Total spend per sat.service score: jittered points with an unfilled box
# plot drawn on top; colour follows prop.spend.online.
# Rows with a missing sat.service are dropped with is.na() rather than by
# comparing against the string "NA".
clarks %>%
  filter(!is.na(sat.service)) %>%
  ggplot(
    aes(
      x = sat.service,
      y = total.spend,
      group = sat.service,
      colour = prop.spend.online
    )
  ) +
  geom_jitter(alpha = 0.1) +
  geom_boxplot(alpha = 0)
```


```{r}
# Total spend per sat.selection score: jittered points with an unfilled
# box plot drawn on top; colour follows prop.spend.online.
# Rows with a missing sat.selection are dropped with is.na() rather than
# by comparing against the string "NA".
clarks %>%
  filter(!is.na(sat.selection)) %>%
  ggplot(
    aes(
      x = sat.selection,
      y = total.spend,
      group = sat.selection,
      colour = prop.spend.online
    )
  ) +
  geom_jitter(alpha = 0.1) +
  geom_boxplot(alpha = 0)
```



```{r}
# sat.service against prop.spend.online as a faint scatter plot.
# Rows are kept only when sat.service is present (tested with is.na()
# rather than a comparison against the string "NA") and
# prop.spend.online is not NaN.
# NOTE(review): NaN presumably arises from 0/0 for customers with no
# spend -- confirm against where prop.spend.online is computed.
clarks %>%
  filter(!is.na(sat.service), !is.nan(prop.spend.online)) %>%
  ggplot(aes(x = prop.spend.online, y = sat.service)) +
  geom_point(alpha = 0.1)

```
```{r}

# Distribution of prop.spend.online per sat.service score: faint jittered
# points with an unfilled red violin outline on top.
# Rows with a missing sat.service are dropped with is.na() rather than by
# comparing against the string "NA".
clarks %>%
  filter(!is.na(sat.service)) %>%
  ggplot(
    aes(x = sat.service, y = prop.spend.online, group = sat.service)
  ) +
  geom_jitter(alpha = 0.05) +
  geom_violin(alpha = 0, colour = "red")
```

```{r}

# Distribution of prop.spend.online per sat.selection score: faint
# jittered points with an unfilled red violin outline on top.
# Rows with a missing sat.selection are dropped with is.na() rather than
# by comparing against the string "NA".
clarks %>%
  filter(!is.na(sat.selection)) %>%
  ggplot(
    aes(x = sat.selection, y = prop.spend.online, group = sat.selection)
  ) +
  geom_jitter(alpha = 0.05) +
  geom_violin(alpha = 0, colour = "red")
```



```{r}
# Bar chart of mean total spend for each sat.selection value.
# No rows are filtered beforehand, so the summary also contains a group
# for missing sat.selection values if any exist.
clarks %>%
  group_by(sat.selection) %>%
  summarize(mean.spend = mean(total.spend)) %>%
  ggplot(aes(x = sat.selection, y = mean.spend)) +
  geom_col()
```


```{r}
# Bar chart of mean total spend for each sat.service value.
# No rows are filtered beforehand, so the summary also contains a group
# for missing sat.service values if any exist.
clarks %>%
  group_by(sat.service) %>%
  summarize(mean.spend = mean(total.spend)) %>%
  ggplot(aes(x = sat.service, y = mean.spend)) +
  geom_col()
```



```{r}

# Count of customers at each sat.selection score.
# Rows with a missing sat.selection are dropped with is.na() rather than
# by comparing against the string "NA".
clarks %>%
  filter(!is.na(sat.selection)) %>%
  ggplot(aes(x = sat.selection)) +
  geom_bar()
  
```


```{r}
# Count of customers at each sat.service score.
# The filter now tests the plotted column (sat.service); it previously
# tested sat.selection, which looks like a copy-paste slip from the
# sat.selection bar chart. Missing values are detected with is.na()
# rather than by comparing against the string "NA".
clarks %>%
  filter(!is.na(sat.service)) %>%
  ggplot(aes(x = sat.service)) +
  geom_bar()
  
```

```{r}
# Total spend split by whether the customer has zero online visits:
# faint jittered points coloured by sat.service, with an unfilled red
# violin outline on top.
ggplot(
  mutate(clarks, no.online.visits = online.visits == 0),
  aes(
    x = no.online.visits,
    y = total.spend,
    group = no.online.visits,
    colour = sat.service
  )
) +
  geom_jitter(alpha = 0.05) +
  geom_violin(alpha = 0, colour = "red")
```

```{r}
# Total spend per online profile, leaving out the "None" profile: faint
# jittered points coloured by sat.service, with an unfilled red violin
# outline on top.
filter(clarks.spend, online.profile != "None") %>%
  ggplot(
    aes(
      x = online.profile,
      y = total.spend,
      group = online.profile,
      colour = sat.service
    )
  ) +
  geom_jitter(alpha = 0.05) +
  geom_violin(alpha = 0, colour = "red")
```



```{r}
# Distance to store per online profile, leaving out the "None" profile:
# faint jittered points coloured by sat.service, with an unfilled red
# violin outline on top.
filter(clarks.spend, online.profile != "None") %>%
  ggplot(
    aes(
      x = online.profile,
      y = distance.to.store,
      group = online.profile,
      colour = sat.service
    )
  ) +
  geom_jitter(alpha = 0.05) +
  geom_violin(alpha = 0, colour = "red")
```




```{r}
# Distance to store against store spend for customers with a positive
# store spend; points are coloured by sat.service.
filter(clarks, store.spend > 0) %>%
  ggplot(
    aes(x = distance.to.store, y = store.spend, colour = sat.service)
  ) +
  geom_point(alpha = 0.1)
```



```{r}
# Mean online spend, split by whether the customer has any store spend
# (store is TRUE when store.spend > 0).
clarks %>%
  group_by(store = store.spend > 0) %>%
  summarize(mean_spend = mean(online.spend))

```

```{r}
# Violin plot of total spend, split by whether the customer has any
# store spend (store is TRUE when store.spend > 0).
ggplot(
  mutate(clarks, store = store.spend > 0),
  aes(x = store, y = total.spend, group = store)
) +
  geom_violin()

```




